import shap
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import plot_roc_curve
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn import set_config
set_config(display="diagram")
# Silence library deprecation chatter in the notebook output.
warnings.filterwarnings('ignore')
# Use the fully-qualified option name; the bare 'max_colwidth' alias relies on
# pandas' deprecated partial option matching.
pd.set_option('display.max_colwidth', 1000)
# Load the Android-permissions dataset: one row per app, one 0/1 column per
# permission, plus a binary 'Result' label (presumably 1 = malicious — confirm
# with the dataset source).
data = pd.read_csv("dataset.csv")
data.head(3)
| android.permission.GET_ACCOUNTS | com.sonyericsson.home.permission.BROADCAST_BADGE | android.permission.READ_PROFILE | android.permission.MANAGE_ACCOUNTS | android.permission.WRITE_SYNC_SETTINGS | android.permission.READ_EXTERNAL_STORAGE | android.permission.RECEIVE_SMS | com.android.launcher.permission.READ_SETTINGS | android.permission.WRITE_SETTINGS | com.google.android.providers.gsf.permission.READ_GSERVICES | ... | com.android.launcher.permission.UNINSTALL_SHORTCUT | com.sec.android.iap.permission.BILLING | com.htc.launcher.permission.UPDATE_SHORTCUT | com.sec.android.provider.badge.permission.WRITE | android.permission.ACCESS_NETWORK_STATE | com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE | com.huawei.android.launcher.permission.READ_SETTINGS | android.permission.READ_SMS | android.permission.PROCESS_INCOMING_CALLS | Result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 rows × 87 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 29332 entries, 0 to 29331 Data columns (total 87 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 android.permission.GET_ACCOUNTS 29332 non-null int64 1 com.sonyericsson.home.permission.BROADCAST_BADGE 29332 non-null int64 2 android.permission.READ_PROFILE 29332 non-null int64 3 android.permission.MANAGE_ACCOUNTS 29332 non-null int64 4 android.permission.WRITE_SYNC_SETTINGS 29332 non-null int64 5 android.permission.READ_EXTERNAL_STORAGE 29332 non-null int64 6 android.permission.RECEIVE_SMS 29332 non-null int64 7 com.android.launcher.permission.READ_SETTINGS 29332 non-null int64 8 android.permission.WRITE_SETTINGS 29332 non-null int64 9 com.google.android.providers.gsf.permission.READ_GSERVICES 29332 non-null int64 10 android.permission.DOWNLOAD_WITHOUT_NOTIFICATION 29332 non-null int64 11 android.permission.GET_TASKS 29332 non-null int64 12 android.permission.WRITE_EXTERNAL_STORAGE 29332 non-null int64 13 android.permission.RECORD_AUDIO 29332 non-null int64 14 com.huawei.android.launcher.permission.CHANGE_BADGE 29332 non-null int64 15 com.oppo.launcher.permission.READ_SETTINGS 29332 non-null int64 16 android.permission.CHANGE_NETWORK_STATE 29332 non-null int64 17 com.android.launcher.permission.INSTALL_SHORTCUT 29332 non-null int64 18 android.permission.android.permission.READ_PHONE_STATE 29332 non-null int64 19 android.permission.CALL_PHONE 29332 non-null int64 20 android.permission.WRITE_CONTACTS 29332 non-null int64 21 android.permission.READ_PHONE_STATE 29332 non-null int64 22 com.samsung.android.providers.context.permission.WRITE_USE_APP_FEATURE_SURVEY 29332 non-null int64 23 android.permission.MODIFY_AUDIO_SETTINGS 29332 non-null int64 24 android.permission.ACCESS_LOCATION_EXTRA_COMMANDS 29332 non-null int64 25 android.permission.INTERNET 29332 non-null int64 26 android.permission.MOUNT_UNMOUNT_FILESYSTEMS 29332 non-null int64 27 com.majeur.launcher.permission.UPDATE_BADGE 29332 
non-null int64 28 android.permission.AUTHENTICATE_ACCOUNTS 29332 non-null int64 29 com.htc.launcher.permission.READ_SETTINGS 29332 non-null int64 30 android.permission.ACCESS_WIFI_STATE 29332 non-null int64 31 android.permission.FLASHLIGHT 29332 non-null int64 32 android.permission.READ_APP_BADGE 29332 non-null int64 33 android.permission.USE_CREDENTIALS 29332 non-null int64 34 android.permission.CHANGE_CONFIGURATION 29332 non-null int64 35 android.permission.READ_SYNC_SETTINGS 29332 non-null int64 36 android.permission.BROADCAST_STICKY 29332 non-null int64 37 com.anddoes.launcher.permission.UPDATE_COUNT 29332 non-null int64 38 com.android.alarm.permission.SET_ALARM 29332 non-null int64 39 com.google.android.c2dm.permission.RECEIVE 29332 non-null int64 40 android.permission.KILL_BACKGROUND_PROCESSES 29332 non-null int64 41 com.sonymobile.home.permission.PROVIDER_INSERT_BADGE 29332 non-null int64 42 com.sec.android.provider.badge.permission.READ 29332 non-null int64 43 android.permission.WRITE_CALENDAR 29332 non-null int64 44 android.permission.SEND_SMS 29332 non-null int64 45 com.huawei.android.launcher.permission.WRITE_SETTINGS 29332 non-null int64 46 android.permission.REQUEST_INSTALL_PACKAGES 29332 non-null int64 47 android.permission.SET_WALLPAPER_HINTS 29332 non-null int64 48 android.permission.SET_WALLPAPER 29332 non-null int64 49 com.oppo.launcher.permission.WRITE_SETTINGS 29332 non-null int64 50 android.permission.RESTART_PACKAGES 29332 non-null int64 51 me.everything.badger.permission.BADGE_COUNT_WRITE 29332 non-null int64 52 android.permission.ACCESS_MOCK_LOCATION 29332 non-null int64 53 android.permission.ACCESS_COARSE_LOCATION 29332 non-null int64 54 android.permission.READ_LOGS 29332 non-null int64 55 com.google.android.gms.permission.ACTIVITY_RECOGNITION 29332 non-null int64 56 com.amazon.device.messaging.permission.RECEIVE 29332 non-null int64 57 android.permission.SYSTEM_ALERT_WINDOW 29332 non-null int64 58 android.permission.DISABLE_KEYGUARD 29332 
non-null int64 59 android.permission.USE_FINGERPRINT 29332 non-null int64 60 me.everything.badger.permission.BADGE_COUNT_READ 29332 non-null int64 61 android.permission.CHANGE_WIFI_STATE 29332 non-null int64 62 android.permission.READ_CONTACTS 29332 non-null int64 63 com.android.vending.BILLING 29332 non-null int64 64 android.permission.READ_CALENDAR 29332 non-null int64 65 android.permission.RECEIVE_BOOT_COMPLETED 29332 non-null int64 66 android.permission.WAKE_LOCK 29332 non-null int64 67 android.permission.ACCESS_FINE_LOCATION 29332 non-null int64 68 android.permission.BLUETOOTH 29332 non-null int64 69 android.permission.CAMERA 29332 non-null int64 70 com.android.vending.CHECK_LICENSE 29332 non-null int64 71 android.permission.FOREGROUND_SERVICE 29332 non-null int64 72 android.permission.BLUETOOTH_ADMIN 29332 non-null int64 73 android.permission.VIBRATE 29332 non-null int64 74 android.permission.NFC 29332 non-null int64 75 android.permission.RECEIVE_USER_PRESENT 29332 non-null int64 76 android.permission.CLEAR_APP_CACHE 29332 non-null int64 77 com.android.launcher.permission.UNINSTALL_SHORTCUT 29332 non-null int64 78 com.sec.android.iap.permission.BILLING 29332 non-null int64 79 com.htc.launcher.permission.UPDATE_SHORTCUT 29332 non-null int64 80 com.sec.android.provider.badge.permission.WRITE 29332 non-null int64 81 android.permission.ACCESS_NETWORK_STATE 29332 non-null int64 82 com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE 29332 non-null int64 83 com.huawei.android.launcher.permission.READ_SETTINGS 29332 non-null int64 84 android.permission.READ_SMS 29332 non-null int64 85 android.permission.PROCESS_INCOMING_CALLS 29332 non-null int64 86 Result 29332 non-null int64 dtypes: int64(87) memory usage: 19.5 MB
All the columns have exactly 2 unique values: 0 or 1.
# Per-column count of distinct values, shown as a one-row frame.
# pandas' vectorised nunique() replaces the manual dict-building loop;
# dropna=False matches the original len(unique()) semantics exactly.
pd.DataFrame(data.nunique(dropna=False)).T
| android.permission.GET_ACCOUNTS | com.sonyericsson.home.permission.BROADCAST_BADGE | android.permission.READ_PROFILE | android.permission.MANAGE_ACCOUNTS | android.permission.WRITE_SYNC_SETTINGS | android.permission.READ_EXTERNAL_STORAGE | android.permission.RECEIVE_SMS | com.android.launcher.permission.READ_SETTINGS | android.permission.WRITE_SETTINGS | com.google.android.providers.gsf.permission.READ_GSERVICES | ... | com.android.launcher.permission.UNINSTALL_SHORTCUT | com.sec.android.iap.permission.BILLING | com.htc.launcher.permission.UPDATE_SHORTCUT | com.sec.android.provider.badge.permission.WRITE | android.permission.ACCESS_NETWORK_STATE | com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE | com.huawei.android.launcher.permission.READ_SETTINGS | android.permission.READ_SMS | android.permission.PROCESS_INCOMING_CALLS | Result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
1 rows × 87 columns
# Class-balance check: per the output below the split is ~50/50 (14700 vs 14632).
print(data['Result'].value_counts())
plt.figure(figsize=(7, 5), dpi=100)
# Pass the data vector as the keyword `x=`; positional data vectors are
# deprecated in seaborn >= 0.12.
sns.countplot(x=data['Result']);
1 14700 0 14632 Name: Result, dtype: int64
The dataset is well balanced (14700 vs 14632 samples).
pd.DataFrame(data.isnull().sum()).T
| android.permission.GET_ACCOUNTS | com.sonyericsson.home.permission.BROADCAST_BADGE | android.permission.READ_PROFILE | android.permission.MANAGE_ACCOUNTS | android.permission.WRITE_SYNC_SETTINGS | android.permission.READ_EXTERNAL_STORAGE | android.permission.RECEIVE_SMS | com.android.launcher.permission.READ_SETTINGS | android.permission.WRITE_SETTINGS | com.google.android.providers.gsf.permission.READ_GSERVICES | ... | com.android.launcher.permission.UNINSTALL_SHORTCUT | com.sec.android.iap.permission.BILLING | com.htc.launcher.permission.UPDATE_SHORTCUT | com.sec.android.provider.badge.permission.WRITE | android.permission.ACCESS_NETWORK_STATE | com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE | com.huawei.android.launcher.permission.READ_SETTINGS | android.permission.READ_SMS | android.permission.PROCESS_INCOMING_CALLS | Result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 rows × 87 columns
from random import random  # NOTE(review): appears unused in this notebook

# Features = every permission column; target = the final 'Result' column.
X = data.iloc[:, :-1]
y = data['Result']

# Stratified 85/15 split preserves the class balance in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=121
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((24932, 86), (4400, 86), (24932,), (4400,))
# Before feature selection.
# Pairwise correlation heatmap over all 86 permission features; tick labels
# suppressed because there are too many columns to read.
plt.figure(figsize=(8, 5), dpi=100)
sns.heatmap(X.corr(), cmap='Purples', xticklabels=False, yticklabels=False);
#logistic regression (no tuning - no feature selection - only defaults)
def logistic(xtrain, ytrain, xtest, ytest):
    """Fit a default LogisticRegression and print F1, ROC-AUC and the full
    classification report for the held-out split."""
    print("Logistic Regression --->\n")
    model = LogisticRegression()
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    probs = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
logistic(X_train, y_train, X_test, y_test)
Logistic Regression --->
F1-Score : 0.956
AUC - ROC Score : 0.985
precision recall f1-score support
0 0.96 0.95 0.96 2195
1 0.95 0.96 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# Support vector machines (no tuning - no feature selection - only defaults)
def support_vector(xtrain, ytrain, xtest, ytest):
    """Fit a default SVC (probability estimates enabled for ROC-AUC) and
    print F1, ROC-AUC and the classification report for the held-out split."""
    print("Support Vector Machines --->\n")
    model = SVC(probability=True)
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    probs = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
support_vector(X_train, y_train, X_test, y_test)
Support Vector Machines --->
F1-Score : 0.961
AUC - ROC Score : 0.989
precision recall f1-score support
0 0.97 0.96 0.96 2195
1 0.96 0.97 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# Knn (no tuning - no feature selection - only defaults)
def knn(xtrain, ytrain, xtest, ytest):
    """Fit a 5-neighbour KNN classifier and print F1, ROC-AUC and the
    classification report for the held-out split."""
    print("K Nearest Neighbors --->\n")
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    probs = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
knn(X_train, y_train, X_test, y_test)
K Nearest Neighbors --->
F1-Score : 0.959
AUC - ROC Score : 0.981
precision recall f1-score support
0 0.96 0.95 0.96 2195
1 0.95 0.97 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# Random forests (no tuning - no feature selection - only defaults)
def random_forest(xtrain, ytrain, xtest, ytest):
    """Fit a 300-tree random forest (fixed seed) and print F1, ROC-AUC and
    the classification report for the held-out split."""
    print("Random Forests --->\n")
    model = RandomForestClassifier(n_estimators=300, random_state=0)
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    probs = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
random_forest(X_train, y_train, X_test, y_test)
Random Forests --->
F1-Score : 0.969
AUC - ROC Score : 0.992
precision recall f1-score support
0 0.97 0.97 0.97 2195
1 0.97 0.97 0.97 2205
accuracy 0.97 4400
macro avg 0.97 0.97 0.97 4400
weighted avg 0.97 0.97 0.97 4400
# XGBoost (no tuning - no feature selection - only defaults)
def xgboost_clf(xtrain, ytrain, xtest, ytest):
    """Fit a default XGBClassifier (AUC eval metric, fixed seed) and print
    F1, ROC-AUC and the classification report for the held-out split."""
    print("XGBoost --->\n")
    model = XGBClassifier(eval_metric='auc', random_state=101)
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    probs = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
xgboost_clf(X_train, y_train, X_test, y_test)
XGBoost --->
F1-Score : 0.965
AUC - ROC Score : 0.992
precision recall f1-score support
0 0.96 0.97 0.96 2195
1 0.97 0.96 0.97 2205
accuracy 0.96 4400
macro avg 0.96 0.97 0.96 4400
weighted avg 0.97 0.96 0.97 4400
# CatBoost (no tuning - no feature selection - only defaults)
def catboost_clf(xtrain, ytrain, xtest, ytest):
    """Fit a default CatBoostClassifier and print F1, ROC-AUC and the
    classification report for the held-out split.

    BUG FIX: previously fitted on the notebook-global X_train/y_train
    instead of the xtrain/ytrain parameters, silently ignoring whatever
    training data the caller passed in.
    """
    print("CatBoost --->\n")
    cat = CatBoostClassifier(loss_function='Logloss',
                             verbose=False)
    cat.fit(xtrain, ytrain, plot=True)
    test = Pool(xtest, ytest)
    cat_pred = cat.predict(test)
    print(f"F1-Score : {f1_score(ytest, cat_pred):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, cat.predict_proba(xtest)[:, 1]):.3f}", end="\n\n")
    print(classification_report(ytest, cat_pred))
catboost_clf(X_train, y_train, X_test, y_test)
CatBoost --->
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
F1-Score : 0.966
AUC - ROC Score : 0.993
precision recall f1-score support
0 0.97 0.97 0.97 2195
1 0.97 0.97 0.97 2205
accuracy 0.97 4400
macro avg 0.97 0.97 0.97 4400
weighted avg 0.97 0.97 0.97 4400
# Voting classifier - Random Forest, XGBoost and CatBoost
def vote_clf(xtrain, ytrain, xtest, ytest):
    """Soft-voting ensemble of RF, XGBoost and CatBoost; prints F1,
    ROC-AUC and the classification report for the held-out split."""
    print("Voting Classifier --->\n")
    members = [
        ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
        ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
        ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
    ]
    ensemble = VotingClassifier(estimators=members, voting="soft", n_jobs=-1)
    ensemble.fit(xtrain, ytrain)
    preds = ensemble.predict(xtest)
    probs = ensemble.predict_proba(xtest)[:, 1]
    print(f"\nF1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
vote_clf(X_train, y_train, X_test, y_test)
Voting Classifier --->
F1-Score : 0.967
AUC - ROC Score : 0.993
precision recall f1-score support
0 0.96 0.97 0.97 2195
1 0.97 0.96 0.97 2205
accuracy 0.97 4400
macro avg 0.97 0.97 0.97 4400
weighted avg 0.97 0.97 0.97 4400
# Stacking classifier - Logistic Regression, SVM, KNN, Random Forest, XGBoost and CatBoost.
def stacking_clf(xtrain, ytrain, xtest, ytest):
    """Stacked ensemble of six base learners with a random-forest
    meta-learner (5-fold out-of-fold predictions); prints F1, ROC-AUC
    and the classification report for the held-out split."""
    print("Stacking Classifier --->\n")
    members = [
        ("LR", LogisticRegression()),
        ("SVM", SVC(probability=True)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
        ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
        ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
    ]
    ensemble = StackingClassifier(
        estimators=members,
        final_estimator=RandomForestClassifier(n_estimators=300, random_state=0),
        cv=5,
        passthrough=False,
        n_jobs=-1,
    )
    ensemble.fit(xtrain, ytrain)
    preds = ensemble.predict(xtest)
    probs = ensemble.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, probs):.3f}", end="\n\n")
    print(classification_report(ytest, preds))
stacking_clf(X_train, y_train, X_test, y_test)
Stacking Classifier --->
F1-Score : 0.967
AUC - ROC Score : 0.992
precision recall f1-score support
0 0.96 0.97 0.97 2195
1 0.97 0.96 0.97 2205
accuracy 0.97 4400
macro avg 0.97 0.97 0.97 4400
weighted avg 0.97 0.97 0.97 4400
# Handle multicollinearity by eliminating features based on Variance Inflation Factor (VIF).
def VIF(x):
    """Iteratively drop the feature with the highest VIF until every
    remaining feature has VIF <= 5; return the list of dropped columns.

    BUG FIX: the loop previously read the notebook-global X instead of
    the `x` parameter, so the argument was silently ignored.
    """
    cols_remove = []
    while True:
        temp_x = x.drop(cols_remove, axis=1)
        vf = pd.DataFrame()
        vf['columns'] = temp_x.columns
        # VIF of each column against all the others still in play.
        vf['VIF score'] = [variance_inflation_factor(temp_x.values, i) for i in range(len(temp_x.columns))]
        vf.sort_values(by='VIF score', ascending=False, inplace=True)
        vf.reset_index(drop=True, inplace=True)
        if vf.loc[0, 'VIF score'] > 5:
            print(vf.loc[0, 'columns'], vf.loc[0, 'VIF score'])
            cols_remove.append(vf.loc[0, 'columns'])
        else:
            break
    return cols_remove
# Drop features one at a time until every remaining VIF is <= 5.
print("Columns to be removed (VIF > 5) --->")
cols_remove = VIF(X)
Columns to be removed (VIF > 5) ---> me.everything.badger.permission.BADGE_COUNT_WRITE inf com.sec.android.provider.badge.permission.READ 544.4516399895751 com.oppo.launcher.permission.WRITE_SETTINGS 186.73742628488154 com.huawei.android.launcher.permission.WRITE_SETTINGS 94.1855996984245 com.htc.launcher.permission.UPDATE_SHORTCUT 90.02778090329161 com.anddoes.launcher.permission.UPDATE_COUNT 78.72204740606233 com.huawei.android.launcher.permission.CHANGE_BADGE 42.24730372097385 android.permission.ACCESS_NETWORK_STATE 39.67262119636497 android.permission.READ_APP_BADGE 31.921291142563234 com.majeur.launcher.permission.UPDATE_BADGE 27.942467281595935 com.huawei.android.launcher.permission.READ_SETTINGS 20.40800554077491 com.sonyericsson.home.permission.BROADCAST_BADGE 19.716629104817088 android.permission.ACCESS_COARSE_LOCATION 9.442415097881028 android.permission.READ_CALENDAR 8.319298090331598 com.sec.android.provider.badge.permission.WRITE 7.58175517934947 com.oppo.launcher.permission.READ_SETTINGS 7.230427332697641 android.permission.INTERNET 5.339439869414774 android.permission.RECEIVE_SMS 5.147512049622763 android.permission.READ_PHONE_STATE 5.015054263933838
plt.figure(figsize=(8, 5), dpi=100)
sns.heatmap(X.drop(cols_remove, axis=1).corr(), cmap='Purples', xticklabels=False, yticklabels=False);
# Function to remove columns based on VIF
def remove_columns_vif(x):
    """Drop the VIF-flagged columns (module-level `cols_remove`) from x."""
    return x.drop(cols_remove, axis=1)

remove_col_transformer = FunctionTransformer(remove_columns_vif)
def eval_classifier(name, pipeline):
    """Fit `pipeline` on the notebook-global train split and print F1,
    ROC-AUC and the classification report for the global test split."""
    print(f"{name} --->")
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    probs = pipeline.predict_proba(X_test)[:, 1]
    print(f"\nF1-Score : {f1_score(y_test, preds):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(y_test, probs):.3f}", end="\n\n")
    print(classification_report(y_test, preds))
# Random forests - after feature selection - no tuning.
# Pipeline: drop the VIF-flagged columns, then fit the 300-tree forest.
rf_pipeline = Pipeline(
    steps=[
        ("Feature selection", remove_col_transformer),
        ("Random forest classifier", RandomForestClassifier(n_estimators=300, random_state=0))
    ]
)
rf_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Random forest classifier',
RandomForestClassifier(n_estimators=300, random_state=0))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Random forest classifier',
RandomForestClassifier(n_estimators=300, random_state=0))])FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
RandomForestClassifier(n_estimators=300, random_state=0)
eval_classifier("Random forest pipeline", rf_pipeline)
Random forest pipeline --->
F1-Score : 0.943
AUC - ROC Score : 0.984
precision recall f1-score support
0 0.94 0.94 0.94 2195
1 0.94 0.94 0.94 2205
accuracy 0.94 4400
macro avg 0.94 0.94 0.94 4400
weighted avg 0.94 0.94 0.94 4400
# XGBoost - after VIF feature selection - no tuning.
xg_pipeline = Pipeline(
    steps=[
        ("Feature selection", remove_col_transformer),
        # Step renamed: it was mislabelled "Random forest classifier" (copy-paste).
        ("XGBoost classifier", XGBClassifier(eval_metric='auc', random_state=101))
    ]
)
xg_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Random forest classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc',
gamma=None, gpu_id...policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=101, reg_alpha=None,
reg_lambda=None, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Random forest classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc',
gamma=None, gpu_id...policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=101, reg_alpha=None,
reg_lambda=None, ...))])FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)eval_classifier("XGBoost classifier", xg_pipeline)
XGBoost classifier --->
F1-Score : 0.945
AUC - ROC Score : 0.985
precision recall f1-score support
0 0.93 0.97 0.95 2195
1 0.97 0.92 0.94 2205
accuracy 0.95 4400
macro avg 0.95 0.95 0.95 4400
weighted avg 0.95 0.95 0.95 4400
# CatBoost - after VIF feature selection - no tuning.
cat_pipeline = Pipeline(
    steps=[
        ("Feature selection", remove_col_transformer),
        ("CatBoost classifier", CatBoostClassifier(loss_function='Logloss', verbose=False))
    ]
)
cat_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('CatBoost classifier',
<catboost.core.CatBoostClassifier object at 0x000002F486336D60>)])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('CatBoost classifier',
<catboost.core.CatBoostClassifier object at 0x000002F486336D60>)])FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
<catboost.core.CatBoostClassifier object at 0x000002F486336D60>
eval_classifier("CatBoost classifier", cat_pipeline)
CatBoost classifier --->
F1-Score : 0.946
AUC - ROC Score : 0.986
precision recall f1-score support
0 0.93 0.97 0.95 2195
1 0.97 0.92 0.95 2205
accuracy 0.95 4400
macro avg 0.95 0.95 0.95 4400
weighted avg 0.95 0.95 0.95 4400
# Soft-voting ensemble of the three tree-based models, after VIF selection.
_vote_members = [
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
voting_pipeline = Pipeline(steps=[
    ("Feature selection", remove_col_transformer),
    ("Voting classifier", VotingClassifier(estimators=_vote_members, voting="soft", n_jobs=-1)),
])
voting_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Voting classifier',
VotingClassifier(estimators=[('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsa...
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>)],
n_jobs=-1, voting='soft'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Voting classifier',
VotingClassifier(estimators=[('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsa...
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>)],
n_jobs=-1, voting='soft'))])FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
VotingClassifier(estimators=[('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None,
impor...
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>)],
n_jobs=-1, voting='soft')RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)<catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>
eval_classifier("Voting classifier", voting_pipeline)
Voting classifier --->
F1-Score : 0.946
AUC - ROC Score : 0.986
precision recall f1-score support
0 0.93 0.97 0.95 2195
1 0.97 0.93 0.95 2205
accuracy 0.95 4400
macro avg 0.95 0.95 0.95 4400
weighted avg 0.95 0.95 0.95 4400
# Stacked ensemble: six base learners, CatBoost meta-learner, after VIF selection.
_stack_members = [
    ("LR", LogisticRegression()),
    ("SVM", SVC(probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
stacking_pipeline = Pipeline(steps=[
    ("Feature selection", remove_col_transformer),
    ("Stacking classifier", StackingClassifier(
        estimators=_stack_members,
        final_estimator=CatBoostClassifier(loss_function='Logloss', verbose=False),
        cv=5,
        passthrough=False,
        n_jobs=-1,
    )),
])
stacking_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Stacking classifier',
StackingClassifier(cv=5,
estimators=[('LR', LogisticRegression()),
('SVM', SVC(probability=True)),
('KNN', KNeighborsClassifier()),
('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassif...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48820E760>)],
final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48820E610>,
n_jobs=-1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
('Stacking classifier',
StackingClassifier(cv=5,
estimators=[('LR', LogisticRegression()),
('SVM', SVC(probability=True)),
('KNN', KNeighborsClassifier()),
('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassif...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48820E760>)],
final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48820E610>,
n_jobs=-1))])FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
StackingClassifier(cv=5,
estimators=[('LR', LogisticRegression()),
('SVM', SVC(probability=True)),
('KNN', KNeighborsClassifier()),
('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_roun...
max_depth=None, max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48820E760>)],
final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48820E610>,
n_jobs=-1)LogisticRegression()
SVC(probability=True)
KNeighborsClassifier()
RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)<catboost.core.CatBoostClassifier object at 0x000002F48820E760>
<catboost.core.CatBoostClassifier object at 0x000002F48820E610>
eval_classifier("Stacking classifier", stacking_pipeline)
Stacking classifier --->
F1-Score : 0.942
AUC - ROC Score : 0.986
precision recall f1-score support
0 0.94 0.95 0.94 2195
1 0.95 0.94 0.94 2205
accuracy 0.94 4400
macro avg 0.94 0.94 0.94 4400
weighted avg 0.94 0.94 0.94 4400
# Mutual information between each feature and the target; features with
# near-zero MI carry no predictive signal and are dropped.
# random_state pinned: MIC's nearest-neighbour estimator is stochastic, so the
# selected column set was not reproducible across runs.
mi_score = MIC(X, y, random_state=0)
# NOTE(review): the features are 0/1 — consider discrete_features=True so MI is
# computed with the discrete estimator; left at the default to preserve results.
mi_cols_remove_ind = np.where(mi_score < 0.001)
mi_cols_remove = X.columns[mi_cols_remove_ind[0]]
mi_cols_remove
Index(['android.permission.WRITE_SYNC_SETTINGS',
'android.permission.AUTHENTICATE_ACCOUNTS',
'android.permission.FLASHLIGHT',
'android.permission.READ_SYNC_SETTINGS',
'android.permission.BROADCAST_STICKY',
'android.permission.SET_WALLPAPER_HINTS',
'android.permission.ACCESS_MOCK_LOCATION',
'com.google.android.gms.permission.ACTIVITY_RECOGNITION',
'com.android.vending.CHECK_LICENSE', 'android.permission.VIBRATE',
'android.permission.RECEIVE_USER_PRESENT'],
dtype='object')
def MI_remover(frame):
    """Return *frame* without the low-mutual-information permission columns.

    Relies on the module-level ``mi_cols_remove`` index computed above.
    """
    return frame.drop(columns=mi_cols_remove)
# Wrap the column-dropping function so it can act as a Pipeline step.
mi_remove_col_transformer = FunctionTransformer(MI_remover)

# MI-based feature selection followed by a 300-tree random forest.
_rf_steps = [
    ("Feature selection", mi_remove_col_transformer),
    ("Random forest classifier",
     RandomForestClassifier(n_estimators=300, random_state=0)),
]
rf_pipeline = Pipeline(steps=_rf_steps)
rf_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
('Random forest classifier',
RandomForestClassifier(n_estimators=300, random_state=0))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
('Random forest classifier',
RandomForestClassifier(n_estimators=300, random_state=0))])FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)
RandomForestClassifier(n_estimators=300, random_state=0)
# Score the feature-selected random-forest pipeline on the held-out test set.
eval_classifier("Random forest pipeline", rf_pipeline)
Random forest pipeline --->
F1-Score : 0.965
AUC - ROC Score : 0.991
precision recall f1-score support
0 0.96 0.97 0.96 2195
1 0.97 0.96 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# MI-based feature selection followed by an XGBoost classifier.
xg_pipeline = Pipeline(
    steps=[
        ("Feature selection", mi_remove_col_transformer),
        # Fixed copy-paste bug: this step was mislabelled "Random forest
        # classifier" although it holds an XGBClassifier. The step name is
        # not referenced anywhere else, so renaming it is safe.
        ("XGBoost classifier", XGBClassifier(eval_metric='auc', random_state=101))
    ]
)
xg_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
('Random forest classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc',
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=101, reg_alpha=None,
reg_lambda=None, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
('Random forest classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc',
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=101, reg_alpha=None,
reg_lambda=None, ...))])FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)
eval_classifier("XGBoost classifier", xg_pipeline)
XGBoost classifier --->
F1-Score : 0.963
AUC - ROC Score : 0.991
precision recall f1-score support
0 0.96 0.97 0.96 2195
1 0.97 0.96 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# MI-based feature selection followed by a (silent) CatBoost classifier.
_cat_steps = [
    ("Feature selection", mi_remove_col_transformer),
    ("CatBoost classifier",
     CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
cat_pipeline = Pipeline(steps=_cat_steps)
cat_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
('CatBoost classifier',
<catboost.core.CatBoostClassifier object at 0x000002F4874429A0>)])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
('CatBoost classifier',
<catboost.core.CatBoostClassifier object at 0x000002F4874429A0>)])FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)
<catboost.core.CatBoostClassifier object at 0x000002F4874429A0>
# Score the feature-selected CatBoost pipeline on the held-out test set.
eval_classifier("CatBoost classifier", cat_pipeline)
CatBoost classifier --->
F1-Score : 0.963
AUC - ROC Score : 0.993
precision recall f1-score support
0 0.97 0.96 0.96 2195
1 0.96 0.97 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# The three tree ensembles vote together; "soft" averages their predicted
# probabilities, so every member must expose predict_proba.
_voting_members = [
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
voting_pipeline = Pipeline(
    steps=[
        ("Feature selection", mi_remove_col_transformer),
        ("Voting classifier",
         VotingClassifier(estimators=_voting_members, voting="soft", n_jobs=-1)),
    ]
)
voting_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
('Voting classifier',
VotingClassifier(estimators=[('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_byt...
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F487442400>)],
n_jobs=-1, voting='soft'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
('Voting classifier',
VotingClassifier(estimators=[('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_byt...
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F487442400>)],
n_jobs=-1, voting='soft'))])FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)
VotingClassifier(estimators=[('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None,
impor...
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F487442400>)],
n_jobs=-1, voting='soft')RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)<catboost.core.CatBoostClassifier object at 0x000002F487442400>
# Score the feature-selected soft-voting ensemble on the held-out test set.
eval_classifier("Voting classifier", voting_pipeline)
Voting classifier --->
F1-Score : 0.964
AUC - ROC Score : 0.993
precision recall f1-score support
0 0.96 0.97 0.96 2195
1 0.97 0.96 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# Six heterogeneous base learners stacked under a CatBoost meta-learner.
_stack_base_learners = [
    ("LR", LogisticRegression()),
    ("SVM", SVC(probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
stacking_pipeline = Pipeline(
    steps=[
        ("Feature selection", mi_remove_col_transformer),
        ("Stacking classifier",
         StackingClassifier(
             estimators=_stack_base_learners,
             # Meta-learner trained on the base learners' out-of-fold predictions.
             final_estimator=CatBoostClassifier(loss_function='Logloss', verbose=False),
             cv=5,
             passthrough=False,
             n_jobs=-1,
         )),
    ]
)
stacking_pipeline
Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
('Stacking classifier',
StackingClassifier(cv=5,
estimators=[('LR', LogisticRegression()),
('SVM', SVC(probability=True)),
('KNN', KNeighborsClassifier()),
('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48738A970>)],
final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>,
n_jobs=-1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('Feature selection',
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
('Stacking classifier',
StackingClassifier(cv=5,
estimators=[('LR', LogisticRegression()),
('SVM', SVC(probability=True)),
('KNN', KNeighborsClassifier()),
('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=None,
num_parallel_tree=None,
predictor=None,
random_state=101,
reg_alpha=None,
reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48738A970>)],
final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>,
n_jobs=-1))])FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)
StackingClassifier(cv=5,
estimators=[('LR', LogisticRegression()),
('SVM', SVC(probability=True)),
('KNN', KNeighborsClassifier()),
('random_forest',
RandomForestClassifier(n_estimators=300,
random_state=0)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_roun...
max_depth=None, max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)),
('catboost',
<catboost.core.CatBoostClassifier object at 0x000002F48738A970>)],
final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>,
n_jobs=-1)LogisticRegression()
SVC(probability=True)
KNeighborsClassifier()
RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=101,
reg_alpha=None, reg_lambda=None, ...)<catboost.core.CatBoostClassifier object at 0x000002F48738A970>
<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>
# Score the feature-selected stacking ensemble on the held-out test set.
eval_classifier("Stacking classifier", stacking_pipeline)
Stacking classifier --->
F1-Score : 0.965
AUC - ROC Score : 0.993
precision recall f1-score support
0 0.96 0.97 0.96 2195
1 0.97 0.96 0.96 2205
accuracy 0.96 4400
macro avg 0.96 0.96 0.96 4400
weighted avg 0.96 0.96 0.96 4400
# Apply the MI column filter up front (outside a pipeline) so the search
# tunes only the classifier.
X_train_bayes = X_train.drop(columns=mi_cols_remove)
X_test_bayes = X_test.drop(columns=mi_cols_remove)

# Tuples of values are treated by skopt as categorical dimensions.
_rf_search_space = {
    'n_estimators': (100, 200, 300, 400, 500, 1000),
    'criterion': ("gini", "entropy", "log_loss"),
    'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None),
    'min_samples_split': (2, 5, 10),
    'min_samples_leaf': (1, 2, 4),
    'max_features': ("sqrt", "log2"),
    'bootstrap': (True, False),
}
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=0, n_jobs=-1),
    search_spaces=_rf_search_space,
    cv=3,
    n_jobs=-1,
    n_points=5,  # candidate points evaluated in parallel per iteration
    random_state=121,
)
bayes_search
BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
n_jobs=-1, n_points=5, random_state=121,
search_spaces={'bootstrap': (True, False),
'criterion': ('gini', 'entropy', 'log_loss'),
'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
100, None),
'max_features': ('sqrt', 'log2'),
'min_samples_leaf': (1, 2, 4),
'min_samples_split': (2, 5, 10),
'n_estimators': (100, 200, 300, 400, 500, 1000)})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
n_jobs=-1, n_points=5, random_state=121,
search_spaces={'bootstrap': (True, False),
'criterion': ('gini', 'entropy', 'log_loss'),
'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
100, None),
'max_features': ('sqrt', 'log2'),
'min_samples_leaf': (1, 2, 4),
'min_samples_split': (2, 5, 10),
'n_estimators': (100, 200, 300, 400, 500, 1000)})RandomForestClassifier(n_jobs=-1, random_state=0)
RandomForestClassifier(n_jobs=-1, random_state=0)
# Run the Bayesian hyper-parameter search (3-fold CV) on the filtered training set.
bayes_search.fit(X_train_bayes, y_train)
BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
n_jobs=-1, n_points=5, random_state=121,
search_spaces={'bootstrap': (True, False),
'criterion': ('gini', 'entropy', 'log_loss'),
'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
100, None),
'max_features': ('sqrt', 'log2'),
'min_samples_leaf': (1, 2, 4),
'min_samples_split': (2, 5, 10),
'n_estimators': (100, 200, 300, 400, 500, 1000)})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
n_jobs=-1, n_points=5, random_state=121,
search_spaces={'bootstrap': (True, False),
'criterion': ('gini', 'entropy', 'log_loss'),
'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
100, None),
'max_features': ('sqrt', 'log2'),
'min_samples_leaf': (1, 2, 4),
'min_samples_split': (2, 5, 10),
'n_estimators': (100, 200, 300, 400, 500, 1000)})RandomForestClassifier(n_jobs=-1, random_state=0)
RandomForestClassifier(n_jobs=-1, random_state=0)
# Best hyper-parameter combination found by the search.
bayes_search.best_params_
OrderedDict([('bootstrap', False),
('criterion', 'gini'),
('max_depth', 80),
('max_features', 'log2'),
('min_samples_leaf', 1),
('min_samples_split', 5),
('n_estimators', 400)])
# Refit a forest with the tuned hyper-parameters. Passing
# **bayes_search.best_params_ instead of re-typing the values by hand keeps
# this cell in sync with the search results above (same settings either way).
rf_classifier = RandomForestClassifier(random_state=0, **bayes_search.best_params_)
rf_classifier.fit(X_train_bayes, y_train)
y_pred = rf_classifier.predict(X_test_bayes)
# ROC-AUC needs the positive-class probability, not hard labels.
print(f"\nF1-Score : {f1_score(y_test, y_pred):.3f}")
print(f"AUC - ROC Score : {roc_auc_score(y_test, rf_classifier.predict_proba(X_test_bayes)[:, 1]):.3f}", end="\n\n")
print(classification_report(y_test, y_pred))
F1-Score : 0.966
AUC - ROC Score : 0.991
precision recall f1-score support
0 0.96 0.97 0.97 2195
1 0.97 0.96 0.97 2205
accuracy 0.97 4400
macro avg 0.97 0.97 0.97 4400
weighted avg 0.97 0.97 0.97 4400
# Confusion matrix of the tuned forest on the test set.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 5), dpi=100)
sns.heatmap(cm, annot=True, fmt=".5g", cmap='Blues');
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is the drop-in replacement. Imported locally
# because the file's import block is outside this cell.
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(rf_classifier, X_test_bayes, y_test);
<Figure size 1000x400 with 0 Axes>
# Explain only the first 301 test rows — the model-agnostic Permutation
# explainer is slow (~2.5 min for this slice per the logged run).
xxx = X_test_bayes.reset_index(drop=True).iloc[:301]
explainer = shap.Explainer(rf_classifier.predict, xxx)
shap_values = explainer(xxx)
Permutation explainer: 302it [02:25, 1.96it/s]
# Local explanation: per-feature contributions for the first explained sample.
shap.plots.waterfall(shap_values[0], max_display=xxx.shape[1])
shap.initjs()
# Interactive force plot for the same single sample (JS output).
shap.plots.force(shap_values[0])
shap.initjs()
# Stacked force plot across all explained samples.
shap.plots.force(shap_values)
# Global views: SHAP-value distribution per feature, then mean |SHAP| ranking.
shap.plots.beeswarm(shap_values, max_display=xxx.shape[1])
shap.plots.bar(shap_values, max_display=xxx.shape[1])
shap.summary_plot(shap_values, xxx, max_display=xxx.shape[1])